S-FFSD.csv¶
import scipy.io
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd # For easier feature plotting
import re
# Global plot configuration: high-resolution output and a CJK-capable font
# so the Chinese titles/labels used below render instead of showing boxes.
plt.rcParams['figure.dpi'] = 300
plt.rcParams['font.sans-serif'] = ['Microsoft YaHei'] # Set font to Microsoft YaHei for Chinese characters
plt.rcParams['axes.unicode_minus'] = False # Ensure minus sign displays correctly
# --- Load the S-FFSD transaction dataset ---
try:
    df = pd.read_csv('./data/S-FFSD.csv')
    print("S-FFSD.csv loaded successfully!")
except FileNotFoundError:
    print("Error: S-FFSD.csv not found. Make sure the file is in './data/'.")
    # exit() is provided by the site module and may be absent when run as a
    # script; SystemExit is a builtin and always available.
    raise SystemExit(1)

# Extract the numeric part of the Source/Target identifiers (e.g. 'S123' -> 123).
# Vectorized str.extract replaces the per-row re.search apply: one C-level pass
# instead of a Python lambda per row, with the same "first digit run" semantics.
df['Source_Num'] = df['Source'].str.extract(r'(\d+)', expand=False).astype(int)
df['Target_Num'] = df['Target'].str.extract(r'(\d+)', expand=False).astype(int)

# Collapse rare transaction types (< 5% of rows) into an 'other' bucket so the
# Type pie chart below stays readable.
type_counts = df['Type'].value_counts()
total_count = len(df)
type_percentages = type_counts / total_count
df['Type'] = df['Type'].apply(lambda x: 'other' if type_percentages[x] < 0.05 else x)
# --- Figure 1: 2x2 grid of distribution plots for the S-FFSD columns ---
fig, axes = plt.subplots(2, 2, figsize=(15, 15))

# Histogram of the numeric portion of the Source identifiers.
sns.histplot(df['Source_Num'], bins=20, kde=False, ax=axes[0, 0])
axes[0, 0].set_title('Sourceę°å¼éØåēååøē“ę¹å¾-åé”å', fontsize=16)
axes[0, 0].set_xlabel('ę°å¼', fontsize=12)
axes[0, 0].set_ylabel('é¢ę¬”', fontsize=12)

# Histogram of the numeric portion of the Target identifiers.
sns.histplot(df['Target_Num'], bins=20, kde=False, ax=axes[0, 1])
axes[0, 1].set_title('Targetę°å¼éØåēååøē“ę¹å¾-åé”å', fontsize=16)
axes[0, 1].set_xlabel('ę°å¼', fontsize=12)
axes[0, 1].set_ylabel('é¢ę¬”', fontsize=12)

# Pie chart of Amount, binned into 5 equal-frequency (quantile) buckets.
amount_bins = pd.qcut(df['Amount'], q=5)
amount_bin_counts = amount_bins.value_counts()
amount_bin_counts.plot(kind='pie', autopct='%1.1f%%', ax=axes[1, 0])
axes[1, 0].set_title('Amountēååøé„¼å¾-åé”å')
axes[1, 0].set_ylabel('')

# Pie chart of the Type column (rare types already collapsed into 'other').
type_counts_updated = df['Type'].value_counts()
type_counts_updated.plot(kind='pie', autopct='%1.1f%%', ax=axes[1, 1])
axes[1, 1].set_title('Typeēååøé„¼å¾-åé”å')
axes[1, 1].set_ylabel('')

# --- Figure 2: donut chart of the fraud Labels column ---
plt.figure(figsize=(8, 8))
labels_for_plot = df['Labels'].map({0: 'é欺čÆ', 1: '欺čÆ', 2: 'ęŖę 注'})
labels_counts = labels_for_plot.value_counts()
labels_counts.plot(kind='pie', autopct='%1.1f%%', startangle=90, colors=sns.color_palette("pastel"),
pctdistance=0.85, wedgeprops=dict(width=0.4)) # wedge width < 1 turns the pie into a donut
plt.title('Labelsēååøé„¼å¾-åé”å', fontsize=16)
plt.tight_layout()
plt.show() # Renders both open figures: the 2x2 grid and the Labels donut
S-FFSD.csv loaded successfully!
YelpChi.mat¶
三种边类型（关系矩阵）： R-U-R（同一用户发布的评论）； R-S-R（具有相同星级评分的同一产品上的评论）； R-T-R（在同一产品上同一月份发布的评论）。 （横纵轴是评论）；homo 是三者的合并矩阵
features 是特征矩阵，label 是 0 或 1，表示欺诈与否
# Re-apply the font settings (keeps this cell self-contained when run alone).
plt.rcParams['font.sans-serif'] = ['Microsoft YaHei']
plt.rcParams['axes.unicode_minus'] = False
# Load the YelpChi review-graph dataset (.mat file)
try:
    yelp_data = scipy.io.loadmat('./data/YelpChi.mat')
    print("Data loaded successfully!")
    print("Available keys:", yelp_data.keys())
except FileNotFoundError:
    print("Error: YelpChi.mat not found.")
    exit()
# Adjacency matrices of the three review relations plus their union
# (per the dataset notes above):
#   net_rur - reviews posted by the same user
#   net_rsr - same star rating on the same product
#   net_rtr - same month on the same product
#   homo    - union of the three relations
homo_matrix = yelp_data['homo']
net_rur_matrix = yelp_data['net_rur']
net_rsr_matrix = yelp_data['net_rsr']
net_rtr_matrix = yelp_data['net_rtr']
features_matrix = yelp_data['features']  # per-review feature matrix
labels = yelp_data['label'].flatten() # Ensure labels are a 1D array
Data loaded successfully! Available keys: dict_keys(['__header__', '__version__', '__globals__', 'homo', 'net_rur', 'net_rtr', 'net_rsr', 'features', 'label'])
关系矩阵的热力图-YelpChi.mat¶
def plot_relationship_heatmap(matrix, title, max_size=100):
    """Display a heatmap of the top-left max_size x max_size corner of a relation matrix."""
    # Slice first, then densify, so only the displayed corner is ever converted.
    corner = matrix[:max_size, :max_size]
    if hasattr(corner, "toarray"):
        corner = corner.toarray()

    plt.figure(figsize=(9, 8))
    # Binary colormap for pure 0/1 adjacency data, viridis for weighted matrices.
    is_binary = np.all(np.isin(corner, [0, 1]))
    sns.heatmap(corner, cmap='binary' if is_binary else 'viridis',
                cbar_kws={'label': 'Connection (1) / No Connection (0)'})
    plt.title(title + f" (Top-left {max_size}x{max_size})-åé”å", fontsize=16)
    plt.xlabel("Review Index", fontsize=12)
    plt.ylabel("Review Index", fontsize=12)
    plt.tight_layout()
    plt.show()
# One heatmap per relation graph, then the combined (homo) graph.
yelpchi_relation_plots = [
    (net_rur_matrix, 'R-U-R Matrix (Same User Reviews)'),
    (net_rsr_matrix, 'R-S-R Matrix (Same Star Rating, Same Product)'),
    (net_rtr_matrix, 'R-T-R Matrix (Same Month, Same Product)'),
    (homo_matrix, 'Homogeneous Matrix (Combined Relationships)'),
]
for rel_matrix, rel_title in yelpchi_relation_plots:
    plot_relationship_heatmap(rel_matrix, rel_title)
直方图的形状-YelpChi.mat¶
直方图会告诉您大多数评论与其他评论的连接数量是多少。例如，如果直方图集中在较低的连接数，表示大多数评论只与少数其他评论有关系。 高连接评论的比例： 如果直方图的尾部较长或存在多个峰值，可能表示某一部分评论与其他评论有着异常多的连接。在欺诈检测中，一个评论与非常多的其他评论（尤其是通过同一用户、相同星级或相同时间发布）有关系，可能是欺诈团伙活动的迹象。 不同关系类型的特征： 比较 R-U-R、R-S-R、R-T-R 和 Homogeneous 矩阵的连接数直方图，您可以看出不同类型的关系在评论网络中的密度和分布特征。例如，R-U-R 可能显示某些用户发布了大量评论，导致这些评论之间有很高的连接数。
def plot_connection_counts_histogram(matrix, title):
    """Plot a histogram of per-review degree (row sums) for a relation matrix."""
    dense = matrix.toarray() if hasattr(matrix, "toarray") else matrix
    degrees = dense.sum(axis=1)  # connections per review = row sum

    plt.figure(figsize=(10, 6))
    sns.histplot(degrees, bins=30, kde=True, color='skyblue')
    plt.title(f'Histogram of Connection Counts in {title}-åé”å', fontsize=16)
    plt.xlabel("Number of Connections per Review", fontsize=12)
    plt.ylabel("Number of Reviews", fontsize=12)
    plt.grid(True, linestyle='--', alpha=0.6)
    plt.tight_layout()
    plt.show()
# Degree-distribution histograms for each relation graph and the union graph.
for deg_matrix, deg_title in (
    (net_rur_matrix, 'R-U-R Matrix'),
    (net_rsr_matrix, 'R-S-R Matrix'),
    (net_rtr_matrix, 'R-T-R Matrix'),
    (homo_matrix, 'Homogeneous Matrix'),
):
    plot_connection_counts_histogram(deg_matrix, deg_title)
降维特征数据-YelpChi.mat¶
将高维特征数据投影到二维空间，并可视化这些降维后的数据点，同时根据评论的欺诈标签（labels）进行着色。
这样做是为了在无法直接可视化高于 3 维的数据时，帮助我们观察欺诈评论和非欺诈评论在特征空间中是否存在可分离的模式或聚类。
如果不同类别的点在降维后的空间中能明显地聚成不同的簇，那就表明这些特征对于区分欺诈和非欺诈行为是有效的。
from sklearn.decomposition import TruncatedSVD
from sklearn.manifold import TSNE

print("\n--- Visualizing Features with Dimensionality Reduction ---")

# --- Linear projection: TruncatedSVD to 2 components (works on sparse input) ---
if features_matrix.shape[1] > 2:
    svd = TruncatedSVD(n_components=2, random_state=42)
    features_svd = svd.fit_transform(features_matrix)
    plt.figure(figsize=(10, 8))
    sns.scatterplot(x=features_svd[:, 0], y=features_svd[:, 1], hue=labels, palette='coolwarm', alpha=0.7, s=30)
    plt.title('TruncatedSVD of Features, Colored by Fraud Label-åé”å', fontsize=16)
    plt.xlabel(f'SVD Component 1 ({svd.explained_variance_ratio_[0]*100:.2f}% variance)', fontsize=12)
    plt.ylabel(f'SVD Component 2 ({svd.explained_variance_ratio_[1]*100:.2f}% variance)', fontsize=12)
    plt.legend(title='Fraud Label', loc='best')
    plt.grid(True, linestyle='--', alpha=0.6)
    plt.tight_layout()
    plt.show()
else:
    print("Not enough features for SVD (less than 2 dimensions).")

# --- Non-linear projection: t-SNE on a random subsample (t-SNE is slow on full data) ---
if features_matrix.shape[0] > 1000 and features_matrix.shape[1] > 1:
    print("\nDataset is large. t-SNE can be slow. Running on a sampled subset.")
    # Seed the sampler so the chosen subset (and therefore the plot) is
    # reproducible, matching the Amazon section below which already seeds.
    np.random.seed(42)
    sample_size = 5000  # adjust to trade runtime for coverage
    if features_matrix.shape[0] > sample_size:
        sample_indices = np.random.choice(features_matrix.shape[0], sample_size, replace=False)
        features_sample = features_matrix[sample_indices]
        labels_sample = labels[sample_indices]
    else:
        features_sample = features_matrix
        labels_sample = labels

    print(f"Applying t-SNE on a sample of {features_sample.shape[0]} reviews...")
    tsne = TSNE(n_components=2, random_state=42, perplexity=30, n_jobs=-1)  # perplexity typically 5-50
    # t-SNE requires a dense array; only convert when the features are sparse
    # (loadmat may return either a scipy sparse matrix or a dense ndarray).
    features_sample_dense = features_sample.toarray() if hasattr(features_sample, "toarray") else features_sample
    features_tsne = tsne.fit_transform(features_sample_dense)
    current_labels = labels_sample

    plt.figure(figsize=(10, 8))
    sns.scatterplot(x=features_tsne[:, 0], y=features_tsne[:, 1], hue=current_labels, palette='coolwarm', alpha=0.7, s=30)
    plt.title('t-SNE of Features, Colored by Fraud Label-åé”å', fontsize=16)
    plt.xlabel('t-SNE Component 1', fontsize=12)
    plt.ylabel('t-SNE Component 2', fontsize=12)
    plt.legend(title='Fraud Label', loc='best')
    plt.grid(True, linestyle='--', alpha=0.6)
    plt.tight_layout()
    plt.show()
else:
    print("Not enough data points or features for meaningful t-SNE visualization.")
--- Visualizing Features with Dimensionality Reduction ---
Dataset is large. t-SNE can be slow. Running on a sampled subset. Applying t-SNE on a sample of 5000 reviews...
Amazon.mat¶
# Load the Amazon user-graph dataset (.mat file)
try:
    amazon_data = scipy.io.loadmat('./data/Amazon.mat')
    print("\nAmazon Data loaded successfully!")
    print("Available keys for Amazon:", amazon_data.keys())
except FileNotFoundError:
    print("Error: Amazon.mat not found. Make sure the file is in './data/'.")
    exit()
# Adjacency matrices for the three user relations (net_upu / net_usu / net_uvu)
# and their union (homo), plus the per-user feature matrix and fraud labels.
homo_matrix_amazon = amazon_data['homo']
net_upu_matrix_amazon = amazon_data['net_upu']
net_usu_matrix_amazon = amazon_data['net_usu']
net_uvu_matrix_amazon = amazon_data['net_uvu']
features_matrix_amazon = amazon_data['features']
labels_amazon = amazon_data['label'].flatten()  # flatten to a 1D label array
Amazon Data loaded successfully! Available keys for Amazon: dict_keys(['__header__', '__version__', '__globals__', 'homo', 'net_upu', 'net_usu', 'net_uvu', 'features', 'label'])
print("\n--- Visualizing Relationship Matrices (Amazon.mat) ---")
# One heatmap per Amazon relation graph, then the combined (homo) graph.
amazon_relation_plots = (
    (net_upu_matrix_amazon, 'U-P-U Matrix (Same Product, Same User)'),
    (net_usu_matrix_amazon, 'U-S-U Matrix (Same Product, Same Star Rating)'),
    (net_uvu_matrix_amazon, 'U-V-U Matrix (Same Product, Same Time)'),
    (homo_matrix_amazon, 'Homogeneous Matrix (Combined Relationships)'),
)
for adj, adj_title in amazon_relation_plots:
    plot_relationship_heatmap(adj, adj_title)
--- Visualizing Relationship Matrices (Amazon.mat) ---
直方图的形状-Amazon.mat¶
# Degree-distribution histograms for the Amazon relation graphs.
for adj_amzn, hist_title in (
    (net_upu_matrix_amazon, 'U-P-U Matrix (Amazon)'),
    (net_usu_matrix_amazon, 'U-S-U Matrix (Amazon)'),
    (net_uvu_matrix_amazon, 'U-V-U Matrix (Amazon)'),
    (homo_matrix_amazon, 'Homogeneous Matrix (Amazon)'),
):
    plot_connection_counts_histogram(adj_amzn, hist_title)
降维特征数据-Amazon.mat¶
# --- Linear projection (Amazon): TruncatedSVD to 2 components ---
# TruncatedSVD works directly on sparse input, so no densification is needed here.
if features_matrix_amazon.shape[1] > 2:
    svd = TruncatedSVD(n_components=2, random_state=42)
    features_svd_amazon = svd.fit_transform(features_matrix_amazon)
    plt.figure(figsize=(10, 8))
    sns.scatterplot(x=features_svd_amazon[:, 0], y=features_svd_amazon[:, 1], hue=labels_amazon, palette='coolwarm', alpha=0.7, s=30)
    plt.title('TruncatedSVD of Features, Colored by Fraud Label (Amazon)-åé”å', fontsize=16)
    plt.xlabel(f'SVD Component 1 ({svd.explained_variance_ratio_[0]*100:.2f}% variance)', fontsize=12)
    plt.ylabel(f'SVD Component 2 ({svd.explained_variance_ratio_[1]*100:.2f}% variance)', fontsize=12)
    plt.legend(title='Fraud Label', loc='best')
    plt.grid(True, linestyle='--', alpha=0.6)
    plt.tight_layout()
    plt.show()
else:
    print("Not enough features for SVD (less than 2 dimensions) for Amazon.")

# --- Non-linear projection (Amazon): t-SNE on a reproducible random subsample ---
if features_matrix_amazon.shape[0] > 1000 and features_matrix_amazon.shape[1] > 1:
    print("\nDataset (Amazon) is large. t-SNE can be slow. Running on a sampled subset.")
    np.random.seed(42)  # fixed seed so the sampled subset is reproducible
    sample_size_amazon = 5000
    if features_matrix_amazon.shape[0] > sample_size_amazon:
        sample_indices_amazon = np.random.choice(features_matrix_amazon.shape[0], sample_size_amazon, replace=False)
        features_sample_amazon = features_matrix_amazon[sample_indices_amazon]
        labels_sample_amazon = labels_amazon[sample_indices_amazon]
    else:
        features_sample_amazon = features_matrix_amazon
        labels_sample_amazon = labels_amazon
    print(f"Applying t-SNE on a sample of {features_sample_amazon.shape[0]} reviews from Amazon...")
    tsne = TSNE(n_components=2, random_state=42, perplexity=30, n_jobs=-1)
    # t-SNE requires a dense array; only convert when the features are sparse
    # (loadmat may return either a scipy sparse matrix or a dense ndarray, and
    # ndarray has no .toarray()).
    features_sample_dense_amazon = (features_sample_amazon.toarray()
                                    if hasattr(features_sample_amazon, "toarray")
                                    else features_sample_amazon)
    features_tsne_amazon = tsne.fit_transform(features_sample_dense_amazon)
    current_labels_amazon = labels_sample_amazon
    plt.figure(figsize=(10, 8))
    sns.scatterplot(x=features_tsne_amazon[:, 0], y=features_tsne_amazon[:, 1], hue=current_labels_amazon, palette='coolwarm', alpha=0.7, s=30)
    plt.title('t-SNE of Features, Colored by Fraud Label (Amazon Sampled)-åé”å', fontsize=16)
    plt.xlabel('t-SNE Component 1', fontsize=12)
    plt.ylabel('t-SNE Component 2', fontsize=12)
    plt.legend(title='Fraud Label', loc='best')
    plt.grid(True, linestyle='--', alpha=0.6)
    plt.tight_layout()
    plt.show()
else:
    print("Not enough data points or features for meaningful t-SNE visualization for Amazon.")
Dataset (Amazon) is large. t-SNE can be slow. Running on a sampled subset. Applying t-SNE on a sample of 5000 reviews from Amazon...
!jupyter nbconvert --to html DataAnalysis.ipynb